* This file prepares the main dataset from SITC 4-digit bilateral trade data from Comtrade.
* NOTE: this version rectangularises the data before merging in FTA information.
* NOTE: this version also drops duplicates before rectangularising
cls

version 16
global path "C:\..."
global rawdata "$path\raw data"
* dtapath contains the path to the folder with intermediate Stata data files
global dtapath "$path\dta-files"
capture mkdir "$dtapath"
*resulspath contain the path to the folder where all output from this do-file will be saved
global resultspath "$path\results"
capture mkdir "$resultspath"
cap log close
log using "$resultspath\Data_preparation", replace
cd "$dtapath"

global mirror="no"  // use mirror flows to impute missing imports?

* prepare FTA data from Baier & Bergstrand (2007, JIE) for later merge
use "$rawdata\data_fta_fixed", clear	
replace iso_o = "ZAR" if iso_o=="COD"  // Changing iso codes to fit the Comtrade data (Democratic Rep of Congo and Romania were different) 
replace iso_o = "ROM" if iso_o=="ROU"
replace iso_d = "ZAR" if iso_d=="COD"
replace iso_d = "ROM" if iso_d=="ROU"
recode year (1960=1962)
ren iso_o exporteriso3
ren iso_d importeriso3
drop importer exporter
save tempFTABB.dta, replace

* file with country names (needed to merge in cepii data later on)
import excel "$rawdata\BB-countrylist.xlsx", sheet("Sheet1") firstrow clear
gen reporter=strtrim(NameComtrade)
drop NameComtrade
bys reporter: keep if _n==1
save BB-countrylist_rep.dta, replace
ren reporter partner
save BB-countrylist_par.dta, replace

* trade data (UN Comtrade, SITC Rev 1)
foreach x in 62-65 70-74 75-78 79-82 83-86 87-90 94-96 99-00 01-02 03-04 05 06 07 08 09-10 {
	import delimited "$rawdata\ComtradeBatch_1962-2010\ComtradeBatch_`x'.csv", clear encoding(ISO-8859-1)
		
	keep if length(commoditycode)==7  // work at the sub-group (4-digit) level of SITC b/c not all sub-groups are further subdivided into basic headings (5 digit)
	keep period reporter partner commodity* tradevalue tradeflow
	*keep if tradeflow=="Export"   // this seems to exclude re-exports (see WITS user manual, p.38; gross exports = exports + re-exports)
	drop if tradeflow=="Re-Export" | tradeflow=="Re-Import"
	foreach name in reporter partner {
		replace `name'="USA" if `name'=="USA (before 1981)"
		replace `name'="Germany" if `name'=="Fmr Fed. Rep. of Germany"
		replace `name'="India" if `name'=="India, excl. Sikkim"
		replace `name'="Ethiopia" if `name'=="Fmr Ethiopia"
		replace `name'="Belgium-Luxembourg" if `name'=="Belgium"
		replace `name'="Belgium-Luxembourg" if `name'=="Luxembourg"
	}
	if "`x'"=="99-00" | "`x'"=="01-02" | "`x'"=="03-04" | "`x'"=="05" | "`x'"=="06" | "`x'"=="07" | "`x'"=="08" | "`x'"=="09-10" {
		collapse (sum) tradevalue, by( period reporter partner commoditycode commoditydescription tradeflow)  // merge Belgium and Luxembourg for last years in data
		drop if reporter==partner
	}
	
	merge n:1 reporter using BB-countrylist_rep.dta  
	keep if _merge==3   // we only want to keep the 96 countries present in the B&B(2007) sample
	replace reporter=NameCepii
	drop _merge NameCepii
	merge n:1 partner using BB-countrylist_par.dta
	keep if _merge==3
	replace partner=NameCepii
	drop _merge NameCepii
	drop Original Alphabetic
	tab tradeflow  // make sure only exports and imports left
	* use mirror flows to impute missing imports 
	gen mirrorflow=0
	label var mirrorflow "1: imports imputed from mirror export flow"
	if "$mirror"=="yes" {
		preserve
		keep if tradeflow=="Export"
		keep reporter partner period commoditycode commoditydescription tradevalue
		ren tradevalue exportvalue
		ren reporter temp
		ren partner reporter
		ren temp partner
		save tempexp.dta, replace
		restore
		keep if tradeflow=="Import"
		merge 1:1 reporter partner period commoditycode using tempexp.dta
		corr tradevalue exportvalue   // check correlation high if both mirror flows present
		drop _merge
		replace mirrorflow=1 if tradevalue==. & exportvalue~=.
		replace tradevalue=exportvalue if tradevalue==. & exportvalue~=.
		drop exportvalue
		erase tempexp.dta
	}
	ren tradevalue imports
	drop tradeflow 
	save temp`x'.dta, replace
	
}
	
use temp62-65.dta, replace
foreach x in 70-74 75-78 79-82 83-86 87-90 94-96 99-00 01-02 03-04 05 06 07 08 09-10 {	
	append using temp`x'.dta
}
ren period year
keep if year==1962 | year==1965 | year==1970 | year==1975 | year==1980 | year==1985 | year==1990 | year==1995 | year==2000 | year==2005 | year==2010
codebook reporter partner // check we get all 96 countries in B&B 
sort reporter partner commoditycode year
compress
save BB_allyears.dta, replace
foreach x in 62-65 70-74 75-78 79-82 83-86 87-90 94-96 99-00 01-02 03-04 05 06 07 08 09-10 {
	erase temp`x'.dta
}
for any BB-countrylist_par BB-countrylist_rep: erase X.dta

* numerous duplicates present in BB_allyears.dta; Already seem to be there in the
* original data. For now, simply take averages to avoid problems with fillin below
collapse (mean) imports, by(reporter partner commoditycode commoditydescription year) 
save BB_allyears_noduplicates.dta, replace


* merge in cepii identifiers and rectangularise data
use "$rawdata\geo_cepii.dta", replace
replace country="Cote d'Ivoire" if iso3=="CIV"
bys iso3: keep if _n==1
ren country reporter
save tempgeo_rep.dta, replace
ren reporter partner
save tempgeo_par.dta, replace

use BB_allyears_noduplicates.dta, replace
merge n:1 reporter using tempgeo_rep.dta, keepusing(iso3)
ren iso3 importeriso3
drop if _merge==2
drop _merge
merge n:1 partner using tempgeo_par.dta, keepusing(iso3)
ren iso3 exporteriso3
drop if _merge==2
drop _merge

encode commoditydescription, gen(commodityname)
keep exporteriso3 importeriso3 year commoditycode commodityname imports 


* Option 1: rectangularisation using fillin
fillin exporteriso3 importeriso3 year commoditycode
replace imports=0 if _fillin==1
drop if exporteriso3==importeriso3
save BB_allyears_zeros.dta, replace
* check that fillin generates an equal number of products per country pair
egen pair=concat(exporteriso3 importeriso3)
egen sic_count=count(commoditycode), by(pair)
tab sic_count  // should be 625 products*11 years = 6875

replace imports=0 if _fillin==1
drop if exporteriso3==importeriso3
save BB_allyears_zeros.dta, replace 
for any geo_par geo_rep: erase tempX.dta


/* Option 2: use cross command for rectangularisation if fillin doesn't work properly
preserve
bys exporteriso3: keep if _n==1
keep exporteriso3
save BB_exp_iso3.dta, replace
restore
preserve
bys importeriso3: keep if _n==1
keep importeriso3
save BB_imp_iso3.dta, replace
restore
preserve
bys year: keep if _n==1
keep year
save BB_years.dta, replace
restore
preserve
bys commoditycode: keep if _n==1
keep commoditycode
save BB_productcodes.dta, replace

use BB_exp_iso3.dta, replace
cross using BB_imp_iso3
count // dataset has all possible country combinations from B&B (2007), yielding
* 96*96 = 9,216 obs.
cross using BB_productcodes.dta
count // dataset now has all 625 product codes for each country pair, yielding
*96*96*625 = 5,760,000 obs.
cross using BB_years  // finally add all 11 years for each country-pair-product 
*combinations, yielding 96*96*625*11=63,360,000 obs
save BB_rect.dta, replace
merge 1:1 exporteriso3 importeriso3 commoditycode year using tempdata.dta
*/


* merge in gravity variables (CEPII)
use "$rawdata\dist_cepii.dta", replace
ren iso_o exporteriso3
ren iso_d importeriso3
save tempdist.dta, replace
use BB_allyears_zeros.dta, replace  
merge n:1 importeriso3 exporteriso3 using tempdist.dta
keep if _merge==3
drop _merge
keep exporteriso3 importeriso3 year commoditycode commodityname imports comlang_off comcol curcol col45 distw _fillin
erase tempdist.dta


* FTA variable from Limao (2016)
for any importeriso3 exporteriso3: replace X="SUD" if X=="SDN"   // code for Sudan
merge n:1 importeriso3 exporteriso3 year using "$rawdata\limao_handbook_pta.dta", keepusing(EIA gatt_year_i wto_year_i gatt_year_j wto_year_j)
drop if _merge==2
drop _merge
gen WTO_j = (wto_year_j <= year | gatt_year_j <= year) // the following 13 lines are from Limao's do file
gen WTO_i = (wto_year_i <= year | gatt_year_i <= year) 
gen WTO 	 = (WTO_j == 1 & WTO_i == 1) 
drop wto_year_j wto_year_i gatt*
tab EIA if EIA != -999, gen(EIA_) // generate 7 dummies for EIA 0 to 6 (see Limao, p.286)
rename EIA_1 No_Agreem
rename EIA_2 NRPTA
rename EIA_3 RPTA   
rename EIA_4 FTA
rename EIA_5 CU
rename EIA_6 CM
rename EIA_7 EU
gen PTA = RPTA+FTA+CU+CM+EU 

* FTA variable from Baier & Bergstrand (2007, JIE)
for any importeriso3 exporteriso3: replace X="SDN" if X=="SUD"  
merge n:1 importeriso3 exporteriso3 year using tempFTABB.dta // _merge==1 are years 2005 and 2010 which are not in B&B
drop _merge
label var fta "Trade agreements coded from Baier and Bergstrand (2007, JIE), Table 3"
label var scott_fta "Trade agreements from Baier and Bergstrand (2007, JIE) as sent by Scott Baier"
erase tempFTABB.dta
compress



* save data
if "$mirror"=="yes" {
	save BB_allyears_all_zeros.dta, replace
}
else {
	save BB_allyears_all_zeros_nomirror.dta, replace
}
drop if _fillin==1
drop _fillin
if "$mirror"=="yes" {
	save BB_allyears_all_nozeros.dta, replace
}
else {
	save BB_allyears_all_nozeros_nomirror.dta, replace
}

erase BB_allyears.dta 
erase BB_allyears_noduplicates.dta 
erase BB_allyears_zeros.dta

log close
